In [1]:
#import sys
#sys.path.append('/home/jupyter/site-packages/')
from IPython.display import SVG, display
import spacy
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline
In [2]:
nlp = spacy.load('en')
In [4]:
def info(obj):
    return {'type': type(obj), '__str__': str(obj)}

text = u"""spaCy excels at large-scale information extraction tasks.
It's written from the ground up in carefully memory-managed Cython. """
document = nlp(text)
token = document[0]
span = document[0:3]
pd.DataFrame(list(map(info, [token, span, document])))
Out[4]:
In [5]:
# document.sents is a generator, so iterate over it to see each sentence
print(document.sents)
for sent in document.sents:
    print(sent)
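Since `document.sents` yields `Span` objects, it can also be materialized for quick checks; a small illustrative sketch counting tokens per sentence:
In [ ]:
# each sentence is a Span; len() gives its token count
for i, sent in enumerate(document.sents):
    print("sentence %d: %d tokens" % (i, len(sent)))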
In [8]:
for i, token in enumerate(document):
    print('%3d: "%s"' % (i, token))
In [9]:
### Morphological decomposition
In [12]:
token = document[13]
print("text: %s" % token.text)
print("suffix: %s" % token.suffix_)
print("lemma: %s" % token.lemma_)
In [13]:
### Part of Speech Tagging
In [16]:
# part-of-speech and dependency tagging
attrs = map(lambda token: {
    "token": token,
    "part of speech": token.pos_,
    "dependency": token.dep_},
    document)
pd.DataFrame(list(attrs))
Out[16]:
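The dependency labels above can also be read as arcs: each token points to a syntactic head and may govern children. A hedged sketch extending the same table with head and child tokens:
In [ ]:
# dependency arcs: every token has one head and zero or more children
arcs = map(lambda token: {
    "token": token.text,
    "dependency": token.dep_,
    "head": token.head.text,
    "children": [child.text for child in token.children]},
    document)
pd.DataFrame(list(arcs))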
In [17]:
### Noun Chunking
In [20]:
print("noun chunks: %s" % list(document.noun_chunks))
In [ ]:
### Named Entity Recognition
In [21]:
ents = [(ent, ent.root.ent_type_) for ent in document.ents]
print("entities: %s" % ents)
In [ ]:
### Text Similarity (Using Word Vectors)
In [30]:
# document, span, and token similarity
def plot_similarities(doc, similarities, target):
    f, ax = plt.subplots(1)
    index = range(len(similarities))
    ax.barh(index, similarities)
    ax.set_yticks(index)
    ax.set_yticklabels(doc)
    ax.grid(axis='x')
    ax.set_title("Similarity to '{}'".format(target))
    plt.show()

computer = nlp(u'computer')
document2 = nlp(u'You might be using a machine running Windows')
similarities = list(map(lambda token: token.similarity(computer), document2))

# reverse so the sentence reads top-to-bottom on the horizontal bar chart
document2_r = list(reversed(document2))
similarities_r = list(reversed(similarities))
plot_similarities(document2_r, similarities_r, computer)
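Similarity is defined on `Doc` and `Span` objects as well, via averaged word vectors. A hedged sketch comparing whole documents and a span (`laptop` and `banana` are illustrative examples, not from the original):
In [ ]:
# similarity also works between Doc and Span objects (averaged word vectors)
laptop = nlp(u'I bought a new laptop yesterday')
banana = nlp(u'She peeled a ripe banana')
print("doc-doc similarity:  %.3f" % document2.similarity(laptop))
print("doc-doc similarity:  %.3f" % document2.similarity(banana))
print("span-doc similarity: %.3f" % document2[5:8].similarity(computer))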